Imports and installations

InĀ [Ā ]:
#necessary downloads

!apt install --allow-change-held-packages libcudnn8=8.6.0.163-1+cuda11.8
!pip install -U tensorflow_text tensorflow tensorflow_datasets
!pip install einops
#!pip uninstall -y tensorflow estimator keras
InĀ [2]:
#necessary imports

import concurrent.futures
import collections
import dataclasses
import hashlib
import itertools
import json
import math
import os
import pathlib
import random
import re
import string
import time
import urllib.request
import einops
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
import requests
import tqdm
from google.colab import files

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow_datasets as tfds

Data importing and preprocessing

InĀ [3]:
def flickr8k(path='flickr8k'):
  path = pathlib.Path(path)

  if len(list(path.rglob('*'))) < 16197:
    tf.keras.utils.get_file(
        origin='https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip',
        cache_dir='.',
        cache_subdir=path,
        extract=True)
    tf.keras.utils.get_file(
        origin='https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip',
        cache_dir='.',
        cache_subdir=path,
        extract=True)

  captions = (path/"Flickr8k.token.txt").read_text().splitlines()
  captions = (line.split('\t') for line in captions)
  captions = ((fname.split('#')[0], caption) for (fname, caption) in captions)

  cap_dict = collections.defaultdict(list)
  for fname, cap in captions:
    cap_dict[fname].append(cap)

  train_files = (path/'Flickr_8k.trainImages.txt').read_text().splitlines()
  train_captions = [(str(path/'Flicker8k_Dataset'/fname), cap_dict[fname]) for fname in train_files]

  test_files = (path/'Flickr_8k.testImages.txt').read_text().splitlines()
  test_captions = [(str(path/'Flicker8k_Dataset'/fname), cap_dict[fname]) for fname in test_files]

  train_ds = tf.data.experimental.from_list(train_captions)
  test_ds = tf.data.experimental.from_list(test_captions)

  return train_ds, test_ds
InĀ [4]:
#Loading our dataset
train_raw, test_raw = flickr8k()
Downloading data from https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip
1115419746/1115419746 [==============================] - 16s 0us/step
Downloading data from https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip
2340801/2340801 [==============================] - 0s 0us/step

Image feature extractor Mobilenet¶

InĀ [10]:
IMAGE_SHAPE=(224, 224, 3)
mobilenet = tf.keras.applications.MobileNetV3Small(
    input_shape=IMAGE_SHAPE,
    include_top=False,
    include_preprocessing=True)
mobilenet.trainable=False

def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.io.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, IMAGE_SHAPE[:-1])
    return img

Text tokenizer

InĀ [12]:
def standardize(s):
  s = tf.strings.lower(s)
  s = tf.strings.regex_replace(s, f'[{re.escape(string.punctuation)}]', '')
  s = tf.strings.join(['[START]', s, '[END]'], separator=' ')
  return s
InĀ [13]:
# Use the top 5000 words for a vocabulary.
vocabulary_size = 5000
tokenizer = tf.keras.layers.TextVectorization(
    max_tokens=vocabulary_size,
    standardize=standardize,
    ragged=True)
# Learn the vocabulary from the caption data.
InĀ [14]:
tokenizer.adapt(train_raw.map(lambda fp,txt: txt).unbatch().batch(1024))
InĀ [17]:
# Create mappings for words to indices and indices to words.
word_to_index = tf.keras.layers.StringLookup(
    mask_token="",
    vocabulary=tokenizer.get_vocabulary())
index_to_word = tf.keras.layers.StringLookup(
    mask_token="",
    vocabulary=tokenizer.get_vocabulary(),
    invert=True)

Prepare our datasets¶

InĀ [20]:
def match_shapes(images, captions):
  caption_shape = einops.parse_shape(captions, 'b c')
  captions = einops.rearrange(captions, 'b c -> (b c)')
  images = einops.repeat(
      images, 'b ... -> (b c) ...',
      c = caption_shape['c'])
  return images, captions
InĀ [21]:
for ex_paths, ex_captions in train_raw.batch(32).take(1):
  break

print('image paths:', ex_paths.shape)
print('captions:', ex_captions.shape)
print()

ex_paths, ex_captions = match_shapes(images=ex_paths, captions=ex_captions)

print('image_paths:', ex_paths.shape)
print('captions:', ex_captions.shape)
image paths: (32,)
captions: (32, 5)

image_paths: (160,)
captions: (160,)

To be compatible with keras training the dataset should contain (inputs, labels) pairs. For text generation the tokens are both an input and the labels, shifted by one step. This function will convert an (images, texts) pair to an ((images, input_tokens), label_tokens) pair:

InĀ [22]:
def prepare_txt(imgs, txts):
  tokens = tokenizer(txts)

  input_tokens = tokens[..., :-1]
  label_tokens = tokens[..., 1:]
  return (imgs, input_tokens), label_tokens

This function adds operations to a dataset. The steps are:

  1. Load the images (and ignore images that fail to load).
  2. Replicate images to match the number of captions.
  3. Shuffle and rebatch the image, caption pairs.
  4. Tokenize the text, shift the tokens and add label_tokens.
  5. Convert the text from a RaggedTensor representation to padded dense Tensor representation.
InĀ [23]:
def prepare_dataset(ds, tokenizer, batch_size=32, shuffle_buffer=1000):
  # Load the images and make batches.
  ds = (ds
        .shuffle(10000)
        .map(lambda path, caption: (load_image(path), caption))
        .apply(tf.data.experimental.ignore_errors())
        .batch(batch_size))

  def to_tensor(inputs, labels):
    (images, in_tok), out_tok = inputs, labels
    return (images, in_tok.to_tensor()), out_tok.to_tensor()

  return (ds
          .map(match_shapes, tf.data.AUTOTUNE)
          .unbatch()
          .shuffle(shuffle_buffer)
          .batch(batch_size)
          .map(prepare_txt, tf.data.AUTOTUNE)
          .map(to_tensor, tf.data.AUTOTUNE)
          )
InĀ [24]:
train_ds = prepare_dataset(train_raw, tokenizer)
train_ds.element_spec
WARNING:tensorflow:From <ipython-input-23-03f5d7fa769a>:6: ignore_errors (from tensorflow.python.data.experimental.ops.error_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.Dataset.ignore_errors` instead.
Out[24]:
((TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None),
  TensorSpec(shape=(None, None), dtype=tf.int64, name=None)),
 TensorSpec(shape=(None, None), dtype=tf.int64, name=None))
InĀ [25]:
test_ds = prepare_dataset(test_raw, tokenizer)
test_ds.element_spec
Out[25]:
((TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None),
  TensorSpec(shape=(None, None), dtype=tf.int64, name=None)),
 TensorSpec(shape=(None, None), dtype=tf.int64, name=None))

Cache the image features¶

InĀ [26]:
def save_dataset(ds, save_path, image_model, tokenizer, shards=10, batch_size=32):
  # Load the images and make batches.
  ds = (ds
        .map(lambda path, caption: (load_image(path), caption))
        .apply(tf.data.experimental.ignore_errors())
        .batch(batch_size))

  # Run the feature extractor on each batch
  # Don't do this in a .map, because tf.data runs on the CPU.
  def gen():
    for (images, captions) in tqdm.tqdm(ds):
      feature_maps = image_model(images)

      feature_maps, captions = match_shapes(feature_maps, captions)
      yield feature_maps, captions

  # Wrap the generator in a new tf.data.Dataset.
  new_ds = tf.data.Dataset.from_generator(
      gen,
      output_signature=(
          tf.TensorSpec(shape=image_model.output_shape),
          tf.TensorSpec(shape=(None,), dtype=tf.string)))

  # Apply the tokenization
  new_ds = (new_ds
            .map(prepare_txt, tf.data.AUTOTUNE)
            .unbatch()
            .shuffle(1000))

  # Save the dataset into shard files.
  def shard_func(i, item):
    return i % shards
  new_ds.enumerate().save(save_path, shard_func=shard_func)

def load_dataset(save_path, batch_size=32, shuffle=1000, cycle_length=2):
  def custom_reader_func(datasets):
    datasets = datasets.shuffle(1000)
    return datasets.interleave(lambda x: x, cycle_length=cycle_length)

  ds = tf.data.Dataset.load(save_path, reader_func=custom_reader_func)

  def drop_index(i, x):
    return x

  ds = (ds
        .map(drop_index, tf.data.AUTOTUNE)
        .shuffle(shuffle)
        .padded_batch(batch_size)
        .prefetch(tf.data.AUTOTUNE))
  return ds
InĀ [27]:
save_dataset(train_raw, 'train_cache', mobilenet, tokenizer)
save_dataset(test_raw, 'test_cache', mobilenet, tokenizer)
188it [02:42,  1.16it/s]
32it [00:41,  1.29s/it]

Data ready for training¶

After those preprocessing steps, here are the datasets:

InĀ [28]:
train_ds = load_dataset('train_cache')
test_ds = load_dataset('test_cache')
InĀ [29]:
train_ds.element_spec
Out[29]:
((TensorSpec(shape=(None, 7, 7, 576), dtype=tf.float32, name=None),
  TensorSpec(shape=(None, None), dtype=tf.int64, name=None)),
 TensorSpec(shape=(None, None), dtype=tf.int64, name=None))

The dataset now returns (input, label) pairs suitable for training with keras. The inputs are (images, input_tokens) pairs. The images have been processed with the feature-extractor model. For each location in the input_tokens the model looks at the text so far and tries to predict the next which is lined up at the same location in the labels.

InĀ [45]:
for (inputs, ex_labels) in train_ds.take(1):
  (ex_img, ex_in_tok) = inputs

print(ex_img.shape)
print(ex_in_tok.shape)
print(ex_labels.shape)
(32, 7, 7, 576)
(32, 23)
(32, 23)

The input tokens and the labels are the same, just shifted by 1 step:

InĀ [46]:
print(ex_in_tok[0].numpy())
print(ex_labels[0].numpy())
[   3  185    8   29  273   12   49    7    6  276   13    2  394 3139
   22    2   97    0    0    0    0    0    0]
[ 185    8   29  273   12   49    7    6  276   13    2  394 3139   22
    2   97    4    0    0    0    0    0    0]
InĀ [47]:
train_ds.take(1)
Out[47]:
<_TakeDataset element_spec=((TensorSpec(shape=(None, 7, 7, 576), dtype=tf.float32, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None)), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

A Transformer decoder model¶

No description has been provided for this image

The model will be implemented in three main parts:

  1. Input - The token embedding and positional encoding (SeqEmbedding).
  2. Decoder - A stack of transformer decoder layers (DecoderLayer) where each contains:
    1. A causal self attention later (CausalSelfAttention), where each output location can attend to the output so far.
    2. A cross attention layer (CrossAttention) where each output location can attend to the input image.
    3. A feed forward network (FeedForward) layer which further processes each output location independently.
  3. Output - A multiclass-classification over the output vocabulary.

Input¶

InĀ [48]:
class SeqEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, max_length, depth):
    super().__init__()
    self.pos_embedding = tf.keras.layers.Embedding(input_dim=max_length, output_dim=depth)

    self.token_embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=depth,
        mask_zero=True)

    self.add = tf.keras.layers.Add()

  def call(self, seq):
    seq = self.token_embedding(seq) # (batch, seq, depth)

    x = tf.range(tf.shape(seq)[1])  # (seq)
    x = x[tf.newaxis, :]  # (1, seq)
    x = self.pos_embedding(x)  # (1, seq, depth)

    return self.add([seq,x])

Decoder¶

InĀ [49]:
class CausalSelfAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    # Use Add instead of + so the keras mask propagates through.
    self.add = tf.keras.layers.Add()
    self.layernorm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    attn = self.mha(query=x, value=x,
                    use_causal_mask=True)
    x = self.add([x, attn])
    return self.layernorm(x)

The CrossAttention layer is below. Note the use of return_attention_scores.

InĀ [50]:
class CrossAttention(tf.keras.layers.Layer):
  def __init__(self,**kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.add = tf.keras.layers.Add()
    self.layernorm = tf.keras.layers.LayerNormalization()

  def call(self, x, y, **kwargs):
    attn, attention_scores = self.mha(
             query=x, value=y,
             return_attention_scores=True)

    self.last_attention_scores = attention_scores

    x = self.add([x, attn])
    return self.layernorm(x)

The FeedForward layer is below. Remember that a layers.Dense layer is applied to the last axis of the input. The input will have a shape of (batch, sequence, channels), so it automatically applies pointwise across the batch and sequence axes.

InĀ [51]:
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, units, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
        tf.keras.layers.Dense(units=2*units, activation='relu'),
        tf.keras.layers.Dense(units=units),
        tf.keras.layers.Dropout(rate=dropout_rate),
    ])

    self.layernorm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = x + self.seq(x)
    return self.layernorm(x)

Next arrange these three layers into a larger DecoderLayer. Each decoder layer applies the three smaller layers in sequence. After each sublayer the shape of out_seq is (batch, sequence, channels). The decoder layer also returns the attention_scores for later visualizations.

InĀ [52]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, units, num_heads=1, dropout_rate=0.1):
    super().__init__()

    self.self_attention = CausalSelfAttention(num_heads=num_heads,
                                              key_dim=units,
                                              dropout=dropout_rate)
    self.cross_attention = CrossAttention(num_heads=num_heads,
                                          key_dim=units,
                                          dropout=dropout_rate)
    self.ff = FeedForward(units=units, dropout_rate=dropout_rate)


  def call(self, inputs, training=False):
    in_seq, out_seq = inputs

    # Text input
    out_seq = self.self_attention(out_seq)

    out_seq = self.cross_attention(out_seq, in_seq)

    self.last_attention_scores = self.cross_attention.last_attention_scores

    out_seq = self.ff(out_seq)

    return out_seq

Output¶

  1. Handle bad tokens:

  2. Smart initialization:

InĀ [53]:
#@title
class TokenOutput(tf.keras.layers.Layer):
  def __init__(self, tokenizer, banned_tokens=('', '[UNK]', '[START]'), **kwargs):
    super().__init__()

    self.dense = tf.keras.layers.Dense(
        units=tokenizer.vocabulary_size(), **kwargs)
    self.tokenizer = tokenizer
    self.banned_tokens = banned_tokens

    self.bias = None

  def adapt(self, ds):
    counts = collections.Counter()
    vocab_dict = {name: id
                  for id, name in enumerate(self.tokenizer.get_vocabulary())}

    for tokens in tqdm.tqdm(ds):
      counts.update(tokens.numpy().flatten())

    counts_arr = np.zeros(shape=(self.tokenizer.vocabulary_size(),))
    counts_arr[np.array(list(counts.keys()), dtype=np.int32)] = list(counts.values())

    counts_arr = counts_arr[:]
    for token in self.banned_tokens:
      counts_arr[vocab_dict[token]] = 0

    total = counts_arr.sum()
    p = counts_arr/total
    p[counts_arr==0] = 1.0
    log_p = np.log(p)  # log(1) == 0

    entropy = -(log_p*p).sum()

    print()
    print(f"Uniform entropy: {np.log(self.tokenizer.vocabulary_size()):0.2f}")
    print(f"Marginal entropy: {entropy:0.2f}")

    self.bias = log_p
    self.bias[counts_arr==0] = -1e9

  def call(self, x):
    x = self.dense(x)
    # TODO(b/250038731): Fix this.
    # An Add layer doesn't work because of the different shapes.
    # This clears the mask, that's okay because it prevents keras from rescaling
    # the losses.
    return x + self.bias

The smart initialization will significantly reduce the initial loss:

InĀ [54]:
output_layer = TokenOutput(tokenizer, banned_tokens=('', '[UNK]', '[START]'))
# This might run a little faster if the dataset didn't also have to load the image data.
output_layer.adapt(train_ds.map(lambda inputs, labels: labels))
100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 938/938 [00:17<00:00, 52.35it/s]
Uniform entropy: 8.52
Marginal entropy: 5.29

Build the model¶

To build the model, you need to combine several parts:

  1. The image feature_extractor and the text tokenizer and.
  2. The seq_embedding layer, to convert batches of token-IDs to vectors (batch, sequence, channels).
  3. The stack of DecoderLayers layers that will process the text and image data.
  4. The output_layer which returns a pointwise prediction of what the next word should be.
InĀ [55]:
class Captioner(tf.keras.Model):
  @classmethod
  def add_method(cls, fun):
    setattr(cls, fun.__name__, fun)
    return fun

  def __init__(self, tokenizer, feature_extractor, output_layer, num_layers=1,
               units=256, max_length=50, num_heads=1, dropout_rate=0.1):
    super().__init__()
    self.feature_extractor = feature_extractor
    self.tokenizer = tokenizer
    self.word_to_index = tf.keras.layers.StringLookup(
        mask_token="",
        vocabulary=tokenizer.get_vocabulary())
    self.index_to_word = tf.keras.layers.StringLookup(
        mask_token="",
        vocabulary=tokenizer.get_vocabulary(),
        invert=True)

    self.seq_embedding = SeqEmbedding(
        vocab_size=tokenizer.vocabulary_size(),
        depth=units,
        max_length=max_length)

    self.decoder_layers = [
        DecoderLayer(units, num_heads=num_heads, dropout_rate=dropout_rate)
        for n in range(num_layers)]

    self.output_layer = output_layer
InĀ [56]:
  @Captioner.add_method
  def call(self, inputs):
    image, txt = inputs

    if image.shape[-1] == 3:
      # Apply the feature-extractor, if you get an RGB image.
      image = self.feature_extractor(image)

    # Flatten the feature map
    image = einops.rearrange(image, 'b h w c -> b (h w) c')


    if txt.dtype == tf.string:
      # Apply the tokenizer if you get string inputs.
      txt = tokenizer(txt)

    txt = self.seq_embedding(txt)

    # Look at the image
    for dec_layer in self.decoder_layers:
      txt = dec_layer(inputs=(image, txt))

    txt = self.output_layer(txt)

    return txt
InĀ [57]:
model = Captioner(tokenizer, feature_extractor=mobilenet, output_layer=output_layer,
                  units=256, dropout_rate=0.5, num_layers=2, num_heads=2)
InĀ [58]:
@Captioner.add_method
def simple_gen(self, image, temperature=1):
  initial = self.word_to_index([['[START]']]) # (batch, sequence)
  img_features = self.feature_extractor(image[tf.newaxis, ...])

  tokens = initial # (batch, sequence)
  for n in range(50):
    preds = self((img_features, tokens)).numpy()  # (batch, sequence, vocab)
    preds = preds[:,-1, :]  #(batch, vocab)
    if temperature==0:
        next = tf.argmax(preds, axis=-1)[:, tf.newaxis]  # (batch, 1)
    else:
        next = tf.random.categorical(preds/temperature, num_samples=1)  # (batch, 1)
    tokens = tf.concat([tokens, next], axis=1) # (batch, sequence)

    if next[0] == self.word_to_index('[END]'):
      break
  words = index_to_word(tokens[0, 1:-1])
  result = tf.strings.reduce_join(words, axis=-1, separator=' ')
  return result.numpy().decode()

Train¶

Losses and metrics¶

Here's an implementation of a masked loss and accuracy:

When calculating the mask for the loss, note the loss < 1e8. This term discards the artificial, impossibly high losses for the banned_tokens.

InĀ [60]:
def masked_loss(labels, preds):
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels, preds)

  mask = (labels != 0) & (loss < 1e8)
  mask = tf.cast(mask, loss.dtype)

  loss = loss*mask
  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss

def masked_acc(labels, preds):
  mask = tf.cast(labels!=0, tf.float32)
  preds = tf.argmax(preds, axis=-1)
  labels = tf.cast(labels, tf.int64)
  match = tf.cast(preds == labels, mask.dtype)
  acc = tf.reduce_sum(match*mask)/tf.reduce_sum(mask)
  return acc

Callbacks¶

For feedback during training setup a keras.callbacks.Callback to generate some captions for the surfer image at the end of each epoch.

InĀ [68]:
class GenerateText(tf.keras.callbacks.Callback):
  def __init__(self):
    #image_url = 'https://images.pexels.com/photos/7092613/pexels-photo-7092613.jpeg?auto=compress&cs=tinysrgb&w=1260&h=750&dpr=1'
    image_path = "/content/Profteachingexample.jpg"
    self.image = load_image(image_path)

  def on_epoch_end(self, epochs=None, logs=None):
    print()
    print()
    for t in (0.0, 0.5, 1.0):
      result = self.model.simple_gen(self.image, temperature=t)
      print(result)
    print()
InĀ [70]:
callbacks = [
    GenerateText(),
    tf.keras.callbacks.EarlyStopping(
        patience=3, restore_best_weights=True)]

Train¶

Configure and execute the training.

InĀ [71]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
           loss=masked_loss,
           metrics=[masked_acc])
InĀ [72]:
history = model.fit(
    train_ds.repeat(),
    steps_per_epoch=100,
    validation_data=test_ds.repeat(),
    validation_steps=20,
    epochs=100,
    callbacks=callbacks)
Epoch 1/100
100/100 [==============================] - ETA: 0s - loss: 5.0138 - masked_acc: 0.2003

a man in a a man in a
a man in a woman a red in a man
two purple in the with and teammates stand

100/100 [==============================] - 83s 738ms/step - loss: 5.0138 - masked_acc: 0.2003 - val_loss: 4.7083 - val_masked_acc: 0.2366
Epoch 2/100
100/100 [==============================] - ETA: 0s - loss: 4.6523 - masked_acc: 0.2535

a man in a red and a red
a man is is of a rock
van man climbs a green handicapped the women

100/100 [==============================] - 70s 696ms/step - loss: 4.6523 - masked_acc: 0.2535 - val_loss: 4.4058 - val_masked_acc: 0.2643
Epoch 3/100
100/100 [==============================] - ETA: 0s - loss: 4.4321 - masked_acc: 0.2752

a man in a red and a red and a man
a woman and a large shirt and a blue
a puppies outstretched kayaking shaking object and glacier dressed with two

100/100 [==============================] - 76s 765ms/step - loss: 4.4321 - masked_acc: 0.2752 - val_loss: 4.2230 - val_masked_acc: 0.2837
Epoch 4/100
100/100 [==============================] - ETA: 0s - loss: 4.2665 - masked_acc: 0.2901

a man in a red shirt and a woman in a man
a man is standing on a blue and a boat
a boy jumping a little girl jacket women near to orange shirt

100/100 [==============================] - 73s 731ms/step - loss: 4.2665 - masked_acc: 0.2901 - val_loss: 4.0793 - val_masked_acc: 0.2886
Epoch 5/100
100/100 [==============================] - ETA: 0s - loss: 4.1226 - masked_acc: 0.3062

a man in a red shirt and a woman in a red and a man and a man
a man wearing a pink and a yellow shirt and white dog
a man crosscountry wearing and and sides

100/100 [==============================] - 73s 729ms/step - loss: 4.1226 - masked_acc: 0.3062 - val_loss: 3.8932 - val_masked_acc: 0.3214
Epoch 6/100
100/100 [==============================] - ETA: 0s - loss: 4.0149 - masked_acc: 0.3151

a man in a woman and a woman in a woman in a man
a man is standing in a white and blue and a man in a black and white shirt
a man

100/100 [==============================] - 71s 712ms/step - loss: 4.0149 - masked_acc: 0.3151 - val_loss: 3.9269 - val_masked_acc: 0.3162
Epoch 7/100
100/100 [==============================] - ETA: 0s - loss: 3.9492 - masked_acc: 0.3219

a man in a black and a black and a man in a man and a man and a black and a black and a man and a man and a man and a man is standing in a man and a man and a man in a man
a man in a yellow shirt and a red shirt and a woman and a group of a red shirt
front of a man as a woman is pink back

100/100 [==============================] - 84s 845ms/step - loss: 3.9492 - masked_acc: 0.3219 - val_loss: 3.7461 - val_masked_acc: 0.3290
Epoch 8/100
100/100 [==============================] - ETA: 0s - loss: 3.8674 - masked_acc: 0.3256

a man in a red shirt and a woman in a red shirt and a white shirt
a woman in a the shirt is standing on a red shirt and a people
a young trail in the dress is in the player for the trick

100/100 [==============================] - 74s 738ms/step - loss: 3.8674 - masked_acc: 0.3256 - val_loss: 3.7841 - val_masked_acc: 0.3261
Epoch 9/100
100/100 [==============================] - ETA: 0s - loss: 3.7917 - masked_acc: 0.3327

a man in a red shirt and a woman in a red shirt and a white shirt and a red shirt and a man and a man and a man and a woman and a white shirt and white and a man and a woman and a man in
a group of people are playing in a trampoline
the four mother are are players at the takes night

100/100 [==============================] - 76s 764ms/step - loss: 3.7917 - masked_acc: 0.3327 - val_loss: 3.6401 - val_masked_acc: 0.3435
Epoch 10/100
100/100 [==============================] - ETA: 0s - loss: 3.7249 - masked_acc: 0.3418

a man in a black shirt and a woman in a black and a red shirt and white shirt and a woman in a woman in a man and a man in a black and white shirt and white and a woman in a woman with a woman in
a woman in a blue shirt and black shirt is standing in front of a blue shirt
two girls one woman standing on his bmx crowd of a man posts

100/100 [==============================] - 80s 806ms/step - loss: 3.7249 - masked_acc: 0.3418 - val_loss: 3.7016 - val_masked_acc: 0.3313
Epoch 11/100
100/100 [==============================] - ETA: 0s - loss: 3.6800 - masked_acc: 0.3435

a man in a black and a woman and a woman in a black and a man and a black and a woman and a woman and a man and a man in a woman and a man in a woman in a man and a woman and a
three people are standing on a picture of an old man in front of a white
three girls through some road standing on the camera

100/100 [==============================] - 75s 749ms/step - loss: 3.6800 - masked_acc: 0.3435 - val_loss: 3.5332 - val_masked_acc: 0.3464
Epoch 12/100
100/100 [==============================] - ETA: 0s - loss: 3.6286 - masked_acc: 0.3468

a man in a black and a woman in a white shirt and a black and a black shirt and a woman and a woman and a man and a man in a black and white shirt and white and a woman and a woman and a woman in
two people are sitting on a sidewalk
three skateboarder are carrying a bicycle in a face with a face

100/100 [==============================] - 83s 834ms/step - loss: 3.6286 - masked_acc: 0.3468 - val_loss: 3.4627 - val_masked_acc: 0.3466
Epoch 13/100
100/100 [==============================] - ETA: 0s - loss: 3.6015 - masked_acc: 0.3534

a man in a black shirt and a woman and a woman in a white shirt and a woman
a man in a white hat and smiling
a woman wearing a red and a striped shirt standing of laughing after a house

100/100 [==============================] - 76s 762ms/step - loss: 3.6015 - masked_acc: 0.3534 - val_loss: 3.4537 - val_masked_acc: 0.3540
Epoch 14/100
100/100 [==============================] - ETA: 0s - loss: 3.5384 - masked_acc: 0.3543

a man in a black shirt and a woman in a black and a black shirt and white shirt and a woman in a man and a man and a man in a black and white shirt and white shirt and a black shirt and a black shirt is
a man in a black helmet is walking up a man in a small white shirt
a woman taking a very men in long singing

100/100 [==============================] - 81s 813ms/step - loss: 3.5384 - masked_acc: 0.3543 - val_loss: 3.4021 - val_masked_acc: 0.3610
Epoch 15/100
100/100 [==============================] - ETA: 0s - loss: 3.5169 - masked_acc: 0.3552

a man in a black shirt and a woman in a black shirt and a white shirt and a black shirt and a man in a man in a man is standing in a white shirt and a woman in a man in a black and a woman in
a woman wearing a white shirt and white shirt and a woman posing for a building
a man catches a city street are face while are wear attempts to a cigarette at a river

100/100 [==============================] - 83s 831ms/step - loss: 3.5169 - masked_acc: 0.3552 - val_loss: 3.3880 - val_masked_acc: 0.3537
Epoch 16/100
100/100 [==============================] - ETA: 0s - loss: 3.4538 - masked_acc: 0.3650

a man in a black shirt and a woman in a black and a white shirt and white shirt and a woman in a man and a man in a man in a black and a man and white shirt and a man and a woman in a black
a man standing on a wall
a child in in her shirt outfit sitting on a fishing other at a group of a wall

100/100 [==============================] - 75s 756ms/step - loss: 3.4538 - masked_acc: 0.3650 - val_loss: 3.3539 - val_masked_acc: 0.3544
Epoch 17/100
100/100 [==============================] - ETA: 0s - loss: 3.4124 - masked_acc: 0.3660

a man in a black shirt and a man in a black and a man in a white shirt and a man and a man and a man and a man in a man in a man in a man and a man and a man in a man
a man and woman in a woman posing for a woman in front of a man in a woman and white shirt are a a black and a picture
four people sitting at the flower fire

100/100 [==============================] - 78s 778ms/step - loss: 3.4124 - masked_acc: 0.3660 - val_loss: 3.3390 - val_masked_acc: 0.3609
Epoch 18/100
100/100 [==============================] - ETA: 0s - loss: 3.3904 - masked_acc: 0.3619

a man in a black shirt and a woman is standing on a man in a man in a black and a black and a white shirt and a man is standing in a white shirt
a man in a black shirt and black hat is sitting in front of an older man in front of a man in a man is sitting along a man with a large black and white hat
a man sits in winter bathroom on a containing a street

100/100 [==============================] - 78s 783ms/step - loss: 3.3904 - masked_acc: 0.3619 - val_loss: 3.3477 - val_masked_acc: 0.3590
Epoch 19/100
100/100 [==============================] - ETA: 0s - loss: 3.3459 - masked_acc: 0.3705

a man in a black shirt and a woman in a black jacket and a man in a man and a man in a white shirt and a man and a man in a white shirt and a woman in a man in a man is standing in a
a man and a woman in a woman and white tshirt and a woman in front of a man and sunglasses
two people dressed another side near an man and one of street

100/100 [==============================] - 79s 787ms/step - loss: 3.3459 - masked_acc: 0.3705 - val_loss: 3.3411 - val_masked_acc: 0.3623
Epoch 20/100
100/100 [==============================] - ETA: 0s - loss: 3.2832 - masked_acc: 0.3732

a man in a black shirt and a man in a black shirt and a black shirt is standing in a man in a black and a man in a man in a black and a man in a man in a black shirt
a man in a black hat and a woman in a black shirt and a man in a man with a man and a pink and a man in a white shirt is sitting on a man on a man in the shirt is sitting on a shirt
a man on a smiling of camping mohawk is paddles over a table

100/100 [==============================] - 83s 831ms/step - loss: 3.2832 - masked_acc: 0.3732 - val_loss: 3.2589 - val_masked_acc: 0.3744
Epoch 21/100
100/100 [==============================] - ETA: 0s - loss: 3.2645 - masked_acc: 0.3781

a man in a black shirt and a woman in a black shirt and a woman in a man in a man in a white shirt and a man in a man in a white shirt
a man wearing a black shirt and a black hat and a woman with a woman and a man with her woman in a woman
a man is looking grocery bus at outside

100/100 [==============================] - 79s 795ms/step - loss: 3.2645 - masked_acc: 0.3781 - val_loss: 3.2582 - val_masked_acc: 0.3616
Epoch 22/100
100/100 [==============================] - ETA: 0s - loss: 3.2652 - masked_acc: 0.3733

a man in a black shirt and a woman in a white shirt and a woman in a black and a white shirt
a woman in a black shirt and white shirt is standing on a cellphone
a young boy blows white dress is holding jerseys eating looking at a snowy sunny cart in a bench

100/100 [==============================] - 82s 816ms/step - loss: 3.2652 - masked_acc: 0.3733 - val_loss: 3.2275 - val_masked_acc: 0.3729
Epoch 23/100
100/100 [==============================] - ETA: 0s - loss: 3.2089 - masked_acc: 0.3814

a man in a black shirt and a woman in a white shirt and a white shirt is standing in a woman in a white shirt and a man in a black and a white shirt
a woman in a black shirt and a woman in a black jacket is standing on a white shirt
teenagers playing a man wearing white shirt

100/100 [==============================] - 76s 761ms/step - loss: 3.2089 - masked_acc: 0.3814 - val_loss: 3.2327 - val_masked_acc: 0.3686
Epoch 24/100
100/100 [==============================] - ETA: 0s - loss: 3.1893 - masked_acc: 0.3788

a man in a black shirt is standing on a sidewalk
a woman in a black hat is wearing a white shirt with a light
someone is holding a hair next to the stop at a crowd

100/100 [==============================] - 75s 754ms/step - loss: 3.1893 - masked_acc: 0.3788 - val_loss: 3.2234 - val_masked_acc: 0.3673
Epoch 25/100
100/100 [==============================] - ETA: 0s - loss: 3.1900 - masked_acc: 0.3844

a man in a black shirt and white shirt is standing on a white shirt and a white shirt
a man in a dress and a woman and a woman is sitting on a woman in a white shirt and a white shirt and a man in a bench
a man sitting in a black skull hat wearing a surfboard and sunglasses

100/100 [==============================] - 72s 718ms/step - loss: 3.1900 - masked_acc: 0.3844 - val_loss: 3.1768 - val_masked_acc: 0.3722
Epoch 26/100
100/100 [==============================] - ETA: 0s - loss: 3.1686 - masked_acc: 0.3847

a man in a white shirt and a woman in a white shirt is standing in front of a white shirt
a man is standing on a a white bench while a man in a white shirt is holding a crowd of a crowd
a man smokes a blonde listening to making a batman logo with says american pajamas stops behind her hand

100/100 [==============================] - 75s 753ms/step - loss: 3.1686 - masked_acc: 0.3847 - val_loss: 3.1592 - val_masked_acc: 0.3701
Epoch 27/100
100/100 [==============================] - ETA: 0s - loss: 3.1877 - masked_acc: 0.3822

a man and a woman are standing on a bench
a man and two people are standing in a blue and a woman
two girls wearing orange jeans

100/100 [==============================] - 66s 665ms/step - loss: 3.1877 - masked_acc: 0.3822 - val_loss: 3.0931 - val_masked_acc: 0.3768
Epoch 28/100
100/100 [==============================] - ETA: 0s - loss: 3.1877 - masked_acc: 0.3800

a man in a black shirt and a woman in a white shirt and a woman in a white shirt and a white shirt and a white shirt and a white shirt and a white shirt is standing in a black and a man in a black jacket
a man and a woman are sitting on a bench while a woman and a woman standing in a blue and a crowd
the person is outside of some record flags in an old basket

100/100 [==============================] - 73s 728ms/step - loss: 3.1877 - masked_acc: 0.3800 - val_loss: 3.0981 - val_masked_acc: 0.3835
Epoch 29/100
100/100 [==============================] - ETA: 0s - loss: 3.0838 - masked_acc: 0.3891

a man in a black shirt and white shirt and white shirt is standing in front of a crowd
a man wearing a blue shirt and a black hat is holding a sign
a man wearing sunglasses holds sunglasses looking at a picture in the background

100/100 [==============================] - 70s 701ms/step - loss: 3.0838 - masked_acc: 0.3891 - val_loss: 3.0915 - val_masked_acc: 0.3817
Epoch 30/100
100/100 [==============================] - ETA: 0s - loss: 3.0632 - masked_acc: 0.3940

a man in a black shirt and white shirt and white shirt and white shirt and white shirt and white dog
a girl in a black jacket and white shirt and white hat is holding a picture
two men gather for a parade at an and tan ocean

100/100 [==============================] - 75s 754ms/step - loss: 3.0632 - masked_acc: 0.3940 - val_loss: 3.0614 - val_masked_acc: 0.3835
Epoch 31/100
100/100 [==============================] - ETA: 0s - loss: 3.0490 - masked_acc: 0.3948

a man and a woman are sitting on a bench
a man is standing on a table with a picture of people
a man with glasses and standing at the left car

100/100 [==============================] - 72s 717ms/step - loss: 3.0490 - masked_acc: 0.3948 - val_loss: 2.9916 - val_masked_acc: 0.3934
Epoch 32/100
100/100 [==============================] - ETA: 0s - loss: 3.0326 - masked_acc: 0.3944

a man in a black shirt and a woman are standing in a crowd of a man in a crowd
a woman wearing a black shirt and a woman smiles
several adults and white shirts are smiling at the bed

100/100 [==============================] - 69s 693ms/step - loss: 3.0326 - masked_acc: 0.3944 - val_loss: 3.1274 - val_masked_acc: 0.3729
Epoch 33/100
100/100 [==============================] - ETA: 0s - loss: 3.0132 - masked_acc: 0.3942

a man in a black hat and a woman in a white shirt and a black jacket
a man in a black suit looks for a man in a green jacket
sunglasses on a gallery

100/100 [==============================] - 68s 681ms/step - loss: 3.0132 - masked_acc: 0.3942 - val_loss: 3.0431 - val_masked_acc: 0.3825
Epoch 34/100
100/100 [==============================] - ETA: 0s - loss: 3.0025 - masked_acc: 0.3968

a man in a black shirt is standing on a table
two men stand on the camera while a man in front of a busy street
a man in a blue shirt is smiling at the camera

100/100 [==============================] - 68s 680ms/step - loss: 3.0025 - masked_acc: 0.3968 - val_loss: 2.9842 - val_masked_acc: 0.3950
Epoch 35/100
100/100 [==============================] - ETA: 0s - loss: 3.0346 - masked_acc: 0.3923

a man wearing a black jacket and a woman in a white shirt and a white shirt and a woman in a black jacket
a man and woman are standing in a black and white shirt
two costumes are standing in front of a meadow

100/100 [==============================] - 71s 713ms/step - loss: 3.0346 - masked_acc: 0.3923 - val_loss: 2.9601 - val_masked_acc: 0.3905
Epoch 36/100
100/100 [==============================] - ETA: 0s - loss: 2.9936 - masked_acc: 0.3947

a man in a black shirt and white shirt is standing in a white shirt and a white shirt and white shirt and a woman in a woman in a white dog
a woman is standing in a book
two women sit at a picture together as he dusk

100/100 [==============================] - 75s 750ms/step - loss: 2.9936 - masked_acc: 0.3947 - val_loss: 3.0080 - val_masked_acc: 0.3809
Epoch 37/100
100/100 [==============================] - ETA: 0s - loss: 2.9474 - masked_acc: 0.3995

a man and a woman are standing in front of a white building
a man in a black jacket and a brown dog drinking from a man with a man
a man to phone with a sign girl in her hand

100/100 [==============================] - 68s 683ms/step - loss: 2.9474 - masked_acc: 0.3995 - val_loss: 2.9743 - val_masked_acc: 0.3873
Epoch 38/100
100/100 [==============================] - ETA: 0s - loss: 2.9138 - masked_acc: 0.4048

a man in a black shirt and white shirt is standing on a white bench
two men in a red snowsuit is standing next to a sign
two girls gathered next to the camera

100/100 [==============================] - 68s 678ms/step - loss: 2.9138 - masked_acc: 0.4048 - val_loss: 2.9486 - val_masked_acc: 0.3993
Epoch 39/100
100/100 [==============================] - ETA: 0s - loss: 2.8814 - masked_acc: 0.4071

a man in a black shirt and white shirt is standing in a white shirt and a white shirt
a woman in a black jacket and a man in a white cap stands in front of a sign
a woman only a purple striped shirt and woman sitting halloween

100/100 [==============================] - 76s 758ms/step - loss: 2.8814 - masked_acc: 0.4071 - val_loss: 3.0344 - val_masked_acc: 0.3872
Epoch 40/100
100/100 [==============================] - ETA: 0s - loss: 2.8617 - masked_acc: 0.4111

a man in a black shirt and a woman in a white shirt is standing in front of a white shirt
the man is drinking from a man in a blue shirt and a black shirt
three party happily sitting beside a large group of bags touches the table by trees

100/100 [==============================] - 74s 744ms/step - loss: 2.8617 - masked_acc: 0.4111 - val_loss: 2.9666 - val_masked_acc: 0.3905
Epoch 41/100
100/100 [==============================] - ETA: 0s - loss: 2.8488 - masked_acc: 0.4112

a man in a black shirt and a woman in a white shirt and a white shirt is standing in a white shirt and a white shirt and a white shirt and a white shirt is sitting on a woman in a black shirt and a black shirt is
a man in a black shirt and white shirt is holding a picture
a couple pose for something in front of a woman sits on either puck for a having stadium

100/100 [==============================] - 72s 723ms/step - loss: 2.8488 - masked_acc: 0.4112 - val_loss: 2.9747 - val_masked_acc: 0.3869

Plot the loss and accuracy over the training run:

InĀ [73]:
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.ylim([0, max(plt.ylim())])
plt.xlabel('Epoch #')
plt.ylabel('CE/token')
plt.legend()
Out[73]:
<matplotlib.legend.Legend at 0x7855c84141c0>
No description has been provided for this image
InĀ [74]:
plt.plot(history.history['masked_acc'], label='accuracy')
plt.plot(history.history['val_masked_acc'], label='val_accuracy')
plt.ylim([0, max(plt.ylim())])
plt.xlabel('Epoch #')
plt.ylabel('CE/token')
plt.legend()
Out[74]:
<matplotlib.legend.Legend at 0x7855c8414280>
No description has been provided for this image

Attention plots¶

InĀ [98]:
@Captioner.add_method
def run_and_show_attention(self, image, temperature=0.0):
  result_txt = self.simple_gen(image, temperature)
  str_tokens = result_txt.split()
  str_tokens.append('[END]')

  attention_maps = [layer.last_attention_scores for layer in self.decoder_layers]
  attention_maps = tf.concat(attention_maps, axis=0)
  attention_maps = einops.reduce(
      attention_maps,
      'batch heads sequence (height width) -> sequence height width',
      height=7, width=7,
      reduction='mean')

  plot_attention_maps(image/255, str_tokens, attention_maps)
  t = plt.suptitle(result_txt)
  print(result_txt)
  t.set_y(1.05)
InĀ [99]:
image_path = "/content/Profteachingexample.jpg"
image = load_image(image_path)

run_and_show_attention(model, image)
a man in a black shirt and white shirt is standing on a white bench
No description has been provided for this image
InĀ [100]:
import random
import tensorflow as tf
import matplotlib.pyplot as plt

# Set a seed for reproducibility
seed_value = 42
tf.random.set_seed(seed_value)
random.seed(seed_value)

_, test_dataset = flickr8k()

# Shuffle the test dataset
shuffled_test_dataset = test_dataset.shuffle(buffer_size=10000, seed=seed_value)

# Print and see some random photos from the test dataset with captions
num_photos_to_display = 5

for example in shuffled_test_dataset.take(num_photos_to_display):
    photo_path, captions_tensor = example[0], example[1]

    # Convert the captions tensor to a numpy array
    captions = [caption.numpy().decode('utf-8') for caption in captions_tensor]

    # Load and display the original image
    img = tf.io.read_file(photo_path)
    img = tf.image.decode_jpeg(img, channels=3)
    print(photo_path)

    plt.imshow(img)
    plt.title(f"Photo: {photo_path}")
    plt.axis('off')
    plt.show()

    print("Captions:")
    for caption in captions:
        print(f" - {caption}")
    print("\n")
tf.Tensor(b'flickr8k/Flicker8k_Dataset/486917990_72bd4069af.jpg', shape=(), dtype=string)
No description has been provided for this image
Captions:
 - A girl is climbing a rock wall .
 - A person climbs a steep mountain .
 - A person wearing a white hat climbs a rock .
 - A rock climber ascends .
 - Someone climbs a rocks .


tf.Tensor(b'flickr8k/Flicker8k_Dataset/2621415349_ef1a7e73be.jpg', shape=(), dtype=string)
No description has been provided for this image
Captions:
 - A man in a red baseball cap eats a chip .
 - a man wearing a red hat has a potato chip in his mouth
 - A man wearing sunglasses and a red cap putting a chip in his mouth .
 - A man wearing sunglasses and a red hat is opening his mouth wide and eating a chip .
 - A man with sunglasses on puts a chip in his mouth .


tf.Tensor(b'flickr8k/Flicker8k_Dataset/758921886_55a351dd67.jpg', shape=(), dtype=string)
No description has been provided for this image
Captions:
 - A girl wearing a blue dress is sliding down a tube slide .
 - A little girl in a white top is inside a concrete tube .
 - A little girl is sliding down a tunnel smiling .
 - A little girl smiles as she slides down a tube on a sunny day .
 - A young girl in inside a tunnel .


tf.Tensor(b'flickr8k/Flicker8k_Dataset/3692593096_fbaea67476.jpg', shape=(), dtype=string)
No description has been provided for this image
Captions:
 - Airplane emitting heavy red colored smoke .
 - An airplane is flying over the mountain trying to extinguish a fire .
 - A small plane is dropping a red chemical over the mountaintops .
 - Red spray is being ejected by an orange and white plane flying over the hilltops .
 - Small red airplane flies over mountaintop dropping red substance over fire .


tf.Tensor(b'flickr8k/Flicker8k_Dataset/3217187564_0ffd89dec1.jpg', shape=(), dtype=string)
No description has been provided for this image
Captions:
 - A group of dogs racing .
 - A number eight racing dog is beating a number four racing dog slightly in a race .
 - Several dogs wearing muzzles are racing on a track .
 - There are three dogs wearing numbered jerseys running a race .
 - Three race dogs run to finish a race .


InĀ [118]:
#The  image with attention plots

image_path = "/content/3217187564_0ffd89dec1.jpg"
image = load_image(image_path)

run_and_show_attention(model, image)
a dog is running on a track
No description has been provided for this image
InĀ [116]:
#The  image of a girl climbing
image_path = "/content/3692593096_fbaea67476.jpg"
image = load_image(image_path)

run_and_show_attention(model, image)
a person in a red shirt is climbing a rocky mountain
No description has been provided for this image
InĀ [117]:
from nltk.translate.bleu_score import sentence_bleu

# Assuming references is a list of lists where each inner list contains the reference captions
references = [
    ['Airplane emitting heavy red colored smoke.'],
    ['An airplane is flying over the mountain trying to extinguish a fire.'],
    ['A small plane is dropping a red chemical over the mountaintops.'],
    ['Red spray is being ejected by an orange and white plane flying over the hilltops.'],
    ['Small red airplane flies over mountaintop dropping red substance over fire.']
]

# Assuming hypotheses is a list of generated captions
hypotheses = ['a person in a red shirt is climbing a rocky mountain']

# Tokenize the references and hypotheses
references = [[ref[0].split()] for ref in references]
hypotheses = [h.split() for h in hypotheses]

# Calculate BLEU score for each hypothesis-reference pair
individual_scores = [sentence_bleu(references[i], hypotheses[i]) for i in range(len(hypotheses))]

# Calculate the average BLEU score
average_bleu_score = sum(individual_scores) / len(individual_scores)

print("BLEU Score:", average_bleu_score)
print("BLEU individual_scores:", individual_scores)
BLEU Score: 1.0003688322288243e-231
BLEU individual_scores: [1.0003688322288243e-231]
/usr/local/lib/python3.10/dist-packages/nltk/translate/bleu_score.py:552: UserWarning: 
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  warnings.warn(_msg)
InĀ [101]:
#The first image
image_path = "/content/486917990_72bd4069af.jpg"
image = load_image(image_path)

run_and_show_attention(model, image)
a man is climbing a rock wall
No description has been provided for this image
InĀ [109]:
from nltk.translate.bleu_score import sentence_bleu

# Assuming references is a list of lists where each inner list contains the reference captions
references = [
    ['A girl is climbing a rock wall .'],
    ['A person climbs a steep mountain .'],
    ['A person wearing a white hat climbs a rock .'],
    ['A rock climber ascends .'],
    ['Someone climbs a rocks .']
]

# Assuming hypotheses is a list of generated captions
hypotheses = ['a man is climbing a rock wall']

# Tokenize the references and hypotheses
references = [[ref[0].split()] for ref in references]
hypotheses = [h.split() for h in hypotheses]

# Calculate BLEU score for each hypothesis-reference pair
individual_scores = [sentence_bleu(references[i], hypotheses[i]) for i in range(len(hypotheses))]

# Calculate the average BLEU score
average_bleu_score = sum(individual_scores) / len(individual_scores)

print("BLEU Score:", average_bleu_score)
BLEU Score: 0.5329462628216854